In [2]:
import pandas as pd
#pd.set_option('mode.sim_interactive', True)
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import json, csv
import re

In [10]:
df = pd.read_csv('/Users/Peter/Documents/atlas/atlasdata/data.xls.csv', encoding='latin1')
df = df.replace(np.nan,' ', regex=True)#MUST REPLACE NAN AS NAN IS NOT ITERABLE
#df.dtypes

In [12]:
#ADD SOME COLUMNS and DROP OTHERS
#df = df[df['id']!='geometry']
df['obs_type'] = ""
df['newurl'] =""
#df = df.drop('lon', axis=1)
#df = df.drop('marker', axis=1)
#df['group'] = df.id.str[0:6]  #str function slices string
#df.type  #although type is a keyword this works


Out[12]:
(6312, 20)

Using APPLY to classify observation types according to keywords or strings in certain fields


In [79]:
df = df.replace(np.nan,' ', regex=True)#MUST REPLACE NAN AS NAN IS NOT ITERABLE
#analysis_terms = ['analysis', 'density', 'average','variance','pH']
def mymunge(row): # a VARIETY OF CRITERIA
    if any( ['along' in row['note'], 'transect' in row['type'], 'grade' in row['type'], 'line' in row['type'], 'map' in row['type'], 'plot' in row['type'], 'remonitor' in row['type'], 'plants' in row['type']] ):
        return 'transect'
    if any( ['vert' in row['type'], 'ang' in row['type'], 'step back' in row['note'], 'hoop' in row['note'], 'Hoop' in row['note'], 'bare' in row['label1']] ):
        return 'cover'
    if any( ['infiltracion' in row['type'], 'infiltration' in row['type']] ):
        return 'infiltration'
    if any( ['analysis' in row['type'], 'composited' in row['type'], 'misc' in row['type'], 'density' in row['type'],'variance' in row['type'], 'average' in row['type'], 'pH' in row['type']] ):
        return 'analysis'
    if any( ['photo' in row['type']] ):
        return 'photo'
    if any( ['change' in row['type']] ):
        return 'change'
    if any( ['brix' in row['type'], 'biomass' in row['type'], 'clip' in row['type']] ):
        return 'food_analysis'
    
    
df['obs_type'] = df.apply(mymunge, axis=1)
#SEE HOW WE DID
print(len(df))
print(df.obs_type.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=False))


6312
analysis         2256
transect         1513
cover            1341
infiltration     1065
photo             112
food_analysis      13
NaN                12
Name: obs_type, dtype: int64

In [14]:
df = df.replace(np.nan,'', regex=True)#MUST REPLACE NAN AS NAN IS NOT ITERABLE

def buildValues(row): # build the values json field
    v = '{"observer": "' + row['observer'] \
    + '", "label1": "' + row['label1'] \
    + '", "value1": "' + row['value1'] \
    + '", "label2": "' + row['label2'] \
    + '", "value2": "' + row['value2'] \
    + '", "label3": "' + row['label3'] \
    + '", "value3": "' + row['value3'] \
    + '", "start_date": "' + row['start_date'] \
    + '", "end_date": "' + row['end_date'] \
    + '", "description": "' + row['description'] \
    + '", "photo1": "' + row['photo1'] \
    + '", "photo2": "' + row['photo2'] \
    + '"}'
    return v
    
    
df['values'] = df.apply(buildValues, axis=1)
#SEE HOW WE DID
#print(len(df))
#print(df.obs_type.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=False))
print df['values'][50]


{"observer": "", "label1": "tons C per ha per yr", "value1": "1.17", "label2": "", "value2": "", "label3": "", "value3": "", "start_date": "", "end_date": "", "description": "<p>As reported in <a href="http://www.researchgate.net/publication/272376062_Tillage_and_Cover_Cropping_Affect_Crop_Yields_and_Soil_Carbon_in_the_San_Joaquin_Valley_California">Agronomy Journal</a> (01/2015; DOI: 10.2134/agronj14.0415), reduced tillage with cover cropping trials with cotton and tomatoes in the San Joaquin valley resulted in increased soil carbon in the top 30 cm from 1999 to 2007. Conventional till and no cover cropping also showed slight increases.</p>", "photo1": "", "photo2": ""}

In [15]:
print df['values'][0:10]


0    {"observer": "", "label1": "cm of peat per yea...
1    {"observer": "", "label1": "cm of peat per yea...
2    {"observer": "", "label1": "No-till with cover...
3    {"observer": "Jeff Creque", "label1": "percent...
4    {"observer": "", "label1": "percent increase i...
5    {"observer": "", "label1": "percent increase i...
6    {"observer": "", "label1": "percent organic ma...
7    {"observer": "", "label1": "percent organic ma...
8    {"observer": "", "label1": "percent organic ma...
9    {"observer": "", "label1": "percent per year i...
Name: values, dtype: object

In [62]:
#df[df['obs_type'].isnull()]

In [117]:
df = pd.read_csv('/Users/Peter/Documents/atlas/atlasbiowork.com/atlasbiowork/db/changeREADYG.csv')
#print df.to_json(orient='records')

In [108]:
df.to_csv('/Users/Peter/Documents/atlas/atlasbiowork.com/atlasbiowork/db/changeREADYH.csv', index=False)

In [131]:
def buildJSON(row): # a VARIETY OF CRITERIA
    v=''
    v += 'Observation(values=' + row['values']
    v += ', observer_id=' + str(row['observer_id'])
    v += ', site_id=' + str(row['site_id'])
    v += ', type_id=' + str(row['type_id'])
    v += ').save()'
    return v
    
df['json'] = df.apply(buildJSON, axis=1)
df['json'].to_csv('/Users/Peter/Documents/atlas/atlasbiowork.com/atlasbiowork/db/changeK.csv', index=False)

In [33]:
linephotos = df[(df.type.str.contains('line'))]
angphotos = df[(df.type.str.contains('ang')) | (df.note.str.contains('step back'))]
vertphotos = df[df.type.str.contains('vert')]
len(vertphotos)
#re.findall('\d+', s) #finds digits in s
def get_num(x):
    digits = ''.join(ele for ele in x if ele.isdigit())
    if digits:
        return int(digits)
    pass

#get_num('Hoop 1, 125\'')
#df.ix[459]
for y in range(len(df)):


Out[33]:
6312

In [63]:
#basic row selection from http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/ 
#which has GREAT info on joins
peter_plots = df[(df.lat > 0) & (df.observer.str.contains('Peter Donovan'))]
features = df[(df.type == '*plot summary')|(df.type == 'change')|(df.type.str.contains('remonitor'))];
#df.iloc[100] and df.ix[100] get the row referred to by the default 0-based index
# df.loc['Kellogg LTER'] doesn't work because it's not an index; 
# dfnew = df.set_index('id'); this works even tho id is not unique
#dfnew.loc['Kellogg LTER'] and this works; use inplace=True as arg to modify existing df
# dfnew.loc['BURR1'] returns all rows for this index

#column selector
#df[['type','label3']]; need to use double [] to enclose a list
#new column
#df['new'] = df.lat + 2

In [58]:
#subsetting dataframes by finding rows

change = df[(df['type'] == "change")]
plots = df[df['type']=="*plot summary"]
peter = df[df.observer == "Peter Donovan"]
north = df[df.lat > 49] # gives 19 but df.lat > 49 gives all rows
type(change)


Out[58]:
pandas.core.frame.DataFrame

In [89]:
food_analysis = df[(df['obs_type'] == "food_analysis")]
food_analysis.to_csv('/Users/Peter/Documents/scc/challenge/obs_types/food_analysis.csv', index=False)